contribution

All group members participated in both assignments and, after discussion, produced this report.

Assignment 1

Question1

# Import the raw prices-and-earnings data (tab separated, with a header row).
data <- read.table("prices-and-earnings.txt", header = TRUE, sep = "\t")
# Keep only the columns used in the analysis (city name + 10 numeric variables).
keep_cols <- c(1, 2, 5, 6, 7, 9, 10, 16, 17, 18, 19)
data <- data[, keep_cols]

Question2

# Scale the numeric columns (drop the city-name column) so variables are comparable.
datascaled <- scale(data[, -1])
# Heatmap of the unordered data.
# NOTE: axis labels must come from the scaled matrix itself --
# colnames(data) still contains the city-name column and so has one
# label too many for the 10 columns of z, misaligning the x axis.
plot_ly(x = colnames(datascaled), y = rownames(datascaled),
        z = datascaled, type = "heatmap",
        colors = colorRamp(c("yellow", "red"))) %>%
  layout(title = "heatmap of the data")

It is impossible to see clusters and outliers here: human perception cannot pick out areas of similar color intensity, nor observations with clearly different colors, in this unordered heatmap. In the next step we therefore reorder the data to obtain a permuted dataset in which clusters and outliers become visible.

Question3

# a) Euclidean distance -------------------------------------------------

# Label the rows with the city names so they appear on the heatmap axis.
rownames(datascaled) <- data[, 1]

# dist() uses the Euclidean metric by default.
Euc_rowdist <- dist(datascaled)
Euc_coldist <- dist(t(datascaled))

# Seriate both dissimilarity matrices with Hierarchical Clustering ("HC")
# and extract the resulting permutations.
Euc_order1 <- seriate(Euc_rowdist, "HC")
Euc_order2 <- seriate(Euc_coldist, "HC")
Euc_ord1 <- get_order(Euc_order1)
Euc_ord2 <- get_order(Euc_order2)

# Apply the permutations (rows reversed so the heatmap reads top-down).
Euc_reordmatr <- datascaled[rev(Euc_ord1), Euc_ord2]

# Heatmap of the reordered matrix.
Euc_plot <- plot_ly(
  x = colnames(Euc_reordmatr),
  y = rownames(Euc_reordmatr),
  z = Euc_reordmatr,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
) %>%
  layout(title = "heatmap using Euclidian distance")

Euc_plot
# b) one-minus-correlation distance --------------------------------------

# Convert 1 - correlation into dissimilarity objects for rows and columns.
cor_rowdist <- as.dist(1 - cor(t(datascaled)))
cor_coldist <- as.dist(1 - cor(datascaled))

# Seriate with Hierarchical Clustering and extract the permutations.
cor_order1 <- seriate(cor_rowdist, "HC")
cor_order2 <- seriate(cor_coldist, "HC")
cor_ord1 <- get_order(cor_order1)
cor_ord2 <- get_order(cor_order2)

# Reorder the scaled matrix (rows reversed for top-down reading).
cor_reordmatr <- datascaled[rev(cor_ord1), cor_ord2]

cor_plot <- plot_ly(
  x = colnames(cor_reordmatr),
  y = rownames(cor_reordmatr),
  z = cor_reordmatr,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
) %>%
  layout(title = "heatmap using one minus correlation")

cor_plot

Compared to the plot using one minus correlation, the plot using Euclidean distance seems easier to analyse: there is a clear boundary that divides the observations into four quadrants, so we might divide the cities above and below the central horizontal line into two clusters.

When plotting the heatmap we calculate the distance (similarity) between the observations based on Euclidean distance, which makes it easier to check whether observations belong to the same cluster or not.

Question4

# Euclidean distance with Travelling Salesman Problem (TSP) optimisation --

# Seriate the same dissimilarity matrices, this time with the TSP solver.
TSP_Euc_order1 <- seriate(Euc_rowdist, "TSP")
TSP_Euc_order2 <- seriate(Euc_coldist, "TSP")
TSP_Euc_ord1 <- get_order(TSP_Euc_order1)
TSP_Euc_ord2 <- get_order(TSP_Euc_order2)

# Apply the permutations (rows reversed for top-down reading).
TSP_Euc_reordmatr <- datascaled[rev(TSP_Euc_ord1), TSP_Euc_ord2]

TSP_Euc_plot <- plot_ly(
  x = colnames(TSP_Euc_reordmatr),
  y = rownames(TSP_Euc_reordmatr),
  z = TSP_Euc_reordmatr,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
) %>%
  layout(title = "heatmap using TSP optimization")

TSP_Euc_plot

# Compare the objective values of the two row permutations
# (Hamiltonian path length and raw gradient measure).
row_permu_HC_Hamil <- criterion(x = Euc_rowdist, order = Euc_order1, method = "Path_length")
row_permu_TSP_Hamil <- criterion(x = Euc_rowdist, order = TSP_Euc_order1, method = "Path_length")
row_permu_HC_Grad <- criterion(x = Euc_rowdist, order = Euc_order1, method = "Gradient_raw")
row_permu_TSP_Grad <- criterion(x = Euc_rowdist, order = TSP_Euc_order1, method = "Gradient_raw")

# Tabulate HC vs TSP for both criteria.
data.frame(HC = c(row_permu_HC_Hamil, row_permu_HC_Grad),
           TSP = c(row_permu_TSP_Hamil, row_permu_TSP_Grad))
##                      HC        TSP
## Path_length    138.9674   120.8241
## Gradient_raw 30980.0000 23140.0000

Comparing the two plots, the TSP optimisation makes the boundary less obvious, so it becomes harder to identify the clusters and outliers visually. Considering the values returned by the criterion function, however, since TSP has the lower value for Path_length and Gradient_raw, it can be considered to do a better job in this context.

Question5

# Parallel-coordinate plot (template from the lecture).
# The axes are listed explicitly so their order on the plot is fixed.
paral_dims <- list(
  list(label = "Food.Costs...", values = ~Food.Costs...),
  list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
  list(label = "Clothing.Index", values = ~Clothing.Index),
  list(label = "Hours.Worked", values = ~Hours.Worked),
  list(label = "Wage.Net", values = ~Wage.Net),
  list(label = "Vacation.Days", values = ~Vacation.Days),
  list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
  list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
  list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
  list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)

paral_plot <- plot_ly(data, type = "parcoords", dimensions = paral_dims)

paral_plot
# Same parallel-coordinate plot, with lines coloured by Food.Costs...
col_paral_dims <- list(
  list(label = "Food.Costs...", values = ~Food.Costs...),
  list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
  list(label = "Clothing.Index", values = ~Clothing.Index),
  list(label = "Hours.Worked", values = ~Hours.Worked),
  list(label = "Wage.Net", values = ~Wage.Net),
  list(label = "Vacation.Days", values = ~Vacation.Days),
  list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
  list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
  list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
  list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)

col_paral_plot <- plot_ly(
  data,
  type = "parcoords",
  dimensions = col_paral_dims,
  line = list(color = ~Food.Costs...)
)

col_paral_plot

From the plot, we can see that the dataset can be divided into two clusters. Food.Costs, iPhone.4S.hr and Big.Mac.min are important for defining these two clusters.
Cluster 1: Food.Costs from 400 to 750, iPhone.4S.hr from 22 to 50, Big.Mac.min from 9 to 20.
Cluster 2: Food.Costs from 186 to 400, iPhone.4S.hr from 50 to 435, Big.Mac.min from 20 to 84.

We can characterise cluster 1 as developed cities and cluster 2 as less-developed cities. In cluster 2 the food cost is much lower than in cluster 1; however, people in cluster 2 need to work far more hours to buy an iPhone 4S or a Big Mac. We may conclude that people in cluster 2 earn less money, so these are most likely less-developed cities.

The most prominent outlier is the observation drawn with the yellow line, whose Food.Costs value is 927; the corresponding city is Tokyo. In Tokyo the food cost is as high as in other developed cities, but people working there have to work more hours and get fewer vacation days than in other developed cities. From this perspective, it is an outlier.

Question6

# Juxtaposed radar charts straight from the template (base stars();
# hard to read, kept for comparison with the ggplot2 version below).
Euc_reordmatr_new <- as.data.frame(Euc_reordmatr)
stars(Euc_reordmatr, key.loc = c(17, -1), draw.segments = FALSE,
      col.stars = rep("Yellow", nrow(Euc_reordmatr)))

# ggplot2 version --------------------------------------------------------

# Rescale every variable to [0, 1].
# mutate(across(...)) replaces mutate_all(funs(...)):
# funs() was deprecated in dplyr 0.8.0 and emitted a warning here.
Euc_reordmatr_new1 <- Euc_reordmatr_new %>%
  mutate(across(everything(), rescale))
Euc_reordmatr_new1$name <- rownames(Euc_reordmatr_new)

# Long format via pivot_longer() (gather() is superseded); the explicit
# factor keeps the variables in column order, as gather(factor_key = TRUE) did.
Euc_reordmatr_new2 <- Euc_reordmatr_new1 %>%
  tidyr::pivot_longer(-name, names_to = "variable", values_to = "value") %>%
  mutate(variable = factor(variable, levels = names(Euc_reordmatr_new))) %>%
  arrange(name)

# One radar (polar polygon) per city, faceted by city name.
p <- Euc_reordmatr_new2 %>%
  ggplot(aes(x = variable, y = value, group = name)) +
  geom_polygon(fill = "blue") +
  coord_polar() + theme_bw() + facet_wrap(~ name) +
  theme(axis.text.x = element_text(size = 5)) +
  theme(strip.text = element_text(size = 20))
p

Cluster 1: Munich, Berlin, Barcelona, Frankfurt, Madrid (with a more vertical appearance)
Cluster 2: Sofia, Vilnius, Warsaw, Athens, Ljubljana (with an “r” shape)
Outlier: Caracas

Question7

From an efficiency perspective, the radar chart is the best one for analysing the data, because human eyes can pick up similar shapes very fast. From a simplicity perspective, the ordered heatmap using Euclidean distance is better, since all cities appear in the same plot and the boundary makes it easy to see which cities belong to which clusters.

Assignment 2

Setup

#Q2setup
# Read the adult data set; the csv has no header row
# (header = FALSE spelled out -- never use the reassignable F shorthand),
# then attach descriptive column names.
q2data <- read.table("adult.csv", sep = ",", header = FALSE)
colname <- c("age",
             "workclass",
             "fnlwgt",
             "education",
             "education-num",
             "marital-status",
             "occupation",
             "relationship",
             "race",
             "sex",
             "capital-gain",
             "capital-loss",
             "hours-per-week",
             "native-country",
             "Income level")
colnames(q2data) <- colname

Question1

#q21
# Plain scatter plot of weekly working hours against age,
# coloured by income level (aesthetics set once in ggplot()).
q21_scatter <- ggplot(q2data,
                      aes(x = age, y = `hours-per-week`,
                          color = `Income level`)) +
  geom_point() +
  labs(x = "Age", y = "Hours per week") +
  ggtitle("Hours per Week vs Age: Scatter Plot")
q21_scatter

By observing the above scatter plot, it can be concluded that overplotting is the biggest problem: it makes it hard to tell how the different income levels are distributed.

# Same scatter plot, faceted into one panel per income level
# (trellis template from the course website).
q21_trellis <- ggplot(q2data,
                      aes(x = age, y = `hours-per-week`,
                          color = `Income level`)) +
  geom_point() +
  labs(x = "Age", y = "Hours per week") +
  ggtitle("Hours per Week vs Age: Trellis Plot") +
  facet_wrap(~`Income level`, labeller = "label_both")

q21_trellis

By separating the plot into two income-level groups, one gets a better view of how the different groups are distributed. For income level lower than (or equal to) 50K, the plot has a higher density in the bottom-left corner, which indicates that more observations work fewer than 25 hours and that more people are under 25 years old. It should also be noted that there are fewer observations with income level > 50K.

Question2

#q22
# Age densities for the two income levels, overlaid with
# semi-transparent fills so both curves stay visible.
q22_density <- ggplot(q2data, aes(x = age, fill = `Income level`)) +
  geom_density(alpha = 0.5) +
  ggtitle("Density plot of Age grouped by the Income level") +
  labs(x = "Age", y = "Density")
q22_density

Here, one can see that the density for income level <=50K peaks around 25 years of age, while that for income level >50K peaks around 40–50 years. This indicates that young people often have lower incomes.

# Same density plot, additionally conditioned on marital status
# (one facet per marital-status level).
q22_trellis <- ggplot(q2data, aes(x = age, fill = `Income level`)) +
  geom_density(alpha = 0.5) +
  ggtitle("Density plot of Age grouped by the Income level,  condition on Marital Status") +
  facet_wrap(~`marital-status`)
q22_trellis

Here, the distributions for the different marital-status groups are presented. Most groups are distributed rather similarly, with the higher income level skewed towards slightly older ages. However, for “Never married” and “Married spouse absent”, the income level <=50K group has a younger distribution. This is as expected, especially for never-married people, since younger people tend to be unmarried and to have lower incomes.

Question3

#q23
# Keep only the observations with a non-zero capital loss, then draw a
# 3-D scatter plot with default plotly settings.
nonzero_loss <- q2data$`capital-loss` != 0
filtered_data <- q2data[nonzero_loss, ]
q23_3d_scatter <- plot_ly(filtered_data,
                          x = ~`education-num`,
                          y = ~age,
                          z = ~`capital-loss`) %>%
  add_markers()
q23_3d_scatter

The 3D plot is created with the default plotly settings. Again, overplotting is the issue here, which makes the plot hard to analyse.

# Bin age into six equal-count groups and show a 2-D density raster of
# capital loss vs education, one facet per age group.
filtered_data$age_group <- cut_number(filtered_data$age, n = 6)
q23_trellis <- ggplot(filtered_data,
                      aes(x = `education-num`, y = `capital-loss`)) +
  geom_tile(aes(fill = after_stat(density)), stat = "density2d") +
  facet_wrap(~`age_group`, labeller = "label_both")
q23_trellis

By observing the trellis plot, the capital loss for all age groups are similar, around 1500 to 2500. Also, the difference in education level is not significant across most age groups, except for the age group ranging from 17 to 29, which has slightly lower education compared to the other age groups.

Question4

#q24
# Compare non-overlapping intervals (cut_number) with overlapping
# shingles for conditioning on age.

# --- cut_number: four equal-count, non-overlapping age groups ---
q24_data <- filtered_data
q24_data$age_group <- cut_number(q24_data$age, n = 4)

q24_cut <- ggplot(q24_data, aes(x = `education-num`, y = `capital-loss`)) +
  geom_point() +
  ggtitle("Trellis plot using cut_number() ") +
  facet_wrap(~`age_group`, labeller = "label_both")

# --- shingles: four groups with 10% overlap (template from course website) ---
Agerange <- lattice::equal.count(q24_data$age, number = 4, overlap = 0.1)
L <- matrix(unlist(levels(Agerange)), ncol = 2, byrow = TRUE)
L1 <- data.frame(Lower = L[, 1], Upper = L[, 2], Interval = factor(seq_len(nrow(L))))

# For each interval collect the matching row indices and a class label;
# rows in the overlap deliberately appear in more than one group.
# (lapply + unlist instead of growing index/Class with c() in a loop;
# paste0 instead of paste(..., sep = "").)
pieces <- lapply(seq_len(nrow(L1)), function(i) {
  lbl <- paste0("[", L1$Lower[i], ",", L1$Upper[i], "]")
  ind <- which(q24_data$age >= L1$Lower[i] & q24_data$age <= L1$Upper[i])
  list(index = ind, class = rep(lbl, length(ind)))
})
index <- unlist(lapply(pieces, `[[`, "index"))
Class <- unlist(lapply(pieces, `[[`, "class"))

q24_shingles_data <- q24_data[index, ]
q24_shingles_data$Class <- as.factor(Class)

q24_shingles <- ggplot(q24_shingles_data,
                       aes(x = `education-num`, y = `capital-loss`)) +
  geom_point() +
  ggtitle("Trellis plot using Shingles") +
  facet_wrap(~Class, labeller = "label_both")
q24_cut

q24_shingles

There is little difference between using cut_number and 10% shingles here. As the extra data points do not reveal a different pattern, we can say that there is no advantage to using shingles in this context. In theory, shingles can help avoid missing an important pattern at the boundaries of the cut_number approach; the disadvantage is that, depending on the overlap, the same data points appear in several groups, which makes the plot harder to interpret.

Appendix

# Global chunk options, libraries and seed used throughout the report.
knitr::opts_chunk$set(echo = TRUE)
# NOTE: rm(list = ls()) was removed here -- wiping the global environment
# from inside a script is an anti-pattern; knit in a fresh session instead
# if a clean workspace is needed.
library(ggplot2)
library(plotly)
library(dplyr)
library(seriation)
library(scales)
set.seed(12345)
#import the data
data <- read.table("prices-and-earnings.txt", header = TRUE, sep = "\t")
#select useful data
data <- data[, c(1, 2, 5, 6, 7, 9, 10, 16, 17, 18, 19)]
# Scale the numeric columns (drop the city-name column).
datascaled <- scale(data[, -1])
# Use the scaled matrix's own dimnames for the axes: colnames(data)
# still includes the city-name column and has one label too many for
# the 10 columns of z, which misaligns the x axis.
plot_ly(x = colnames(datascaled), y = rownames(datascaled),
        z = datascaled, type = "heatmap",
        colors = colorRamp(c("yellow", "red"))) %>%
  layout(title = "heatmap of the data")
# a) Euclidean distance -------------------------------------------------

# Label the rows with the city names for the heatmap axis.
rownames(datascaled) <- data[, 1]

# dist() uses the Euclidean metric by default.
Euc_rowdist <- dist(datascaled)
Euc_coldist <- dist(t(datascaled))

# Seriate with Hierarchical Clustering and extract the permutations.
Euc_order1 <- seriate(Euc_rowdist, "HC")
Euc_order2 <- seriate(Euc_coldist, "HC")
Euc_ord1 <- get_order(Euc_order1)
Euc_ord2 <- get_order(Euc_order2)

# Apply the permutations (rows reversed for top-down reading).
Euc_reordmatr <- datascaled[rev(Euc_ord1), Euc_ord2]

Euc_plot <- plot_ly(
  x = colnames(Euc_reordmatr),
  y = rownames(Euc_reordmatr),
  z = Euc_reordmatr,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
) %>%
  layout(title = "heatmap using Euclidian distance")

Euc_plot


# b) one-minus-correlation distance -------------------------------------

cor_rowdist <- as.dist(1 - cor(t(datascaled)))
cor_coldist <- as.dist(1 - cor(datascaled))

# Seriate with Hierarchical Clustering and extract the permutations.
cor_order1 <- seriate(cor_rowdist, "HC")
cor_order2 <- seriate(cor_coldist, "HC")
cor_ord1 <- get_order(cor_order1)
cor_ord2 <- get_order(cor_order2)

cor_reordmatr <- datascaled[rev(cor_ord1), cor_ord2]

cor_plot <- plot_ly(
  x = colnames(cor_reordmatr),
  y = rownames(cor_reordmatr),
  z = cor_reordmatr,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
) %>%
  layout(title = "heatmap using one minus correlation")

cor_plot

# Euclidean distance with Travelling Salesman Problem (TSP) optimisation --

TSP_Euc_order1 <- seriate(Euc_rowdist, "TSP")
TSP_Euc_order2 <- seriate(Euc_coldist, "TSP")
TSP_Euc_ord1 <- get_order(TSP_Euc_order1)
TSP_Euc_ord2 <- get_order(TSP_Euc_order2)
TSP_Euc_reordmatr <- datascaled[rev(TSP_Euc_ord1), TSP_Euc_ord2]

TSP_Euc_plot <- plot_ly(
  x = colnames(TSP_Euc_reordmatr),
  y = rownames(TSP_Euc_reordmatr),
  z = TSP_Euc_reordmatr,
  type = "heatmap",
  colors = colorRamp(c("yellow", "red"))
) %>%
  layout(title = "heatmap using TSP optimization")

TSP_Euc_plot

# Compare the objective values of the two row permutations.
row_permu_HC_Hamil <- criterion(x = Euc_rowdist, order = Euc_order1, method = "Path_length")
row_permu_TSP_Hamil <- criterion(x = Euc_rowdist, order = TSP_Euc_order1, method = "Path_length")
row_permu_HC_Grad <- criterion(x = Euc_rowdist, order = Euc_order1, method = "Gradient_raw")
row_permu_TSP_Grad <- criterion(x = Euc_rowdist, order = TSP_Euc_order1, method = "Gradient_raw")

# Tabulate HC vs TSP for both criteria.
data.frame(HC = c(row_permu_HC_Hamil, row_permu_HC_Grad),
           TSP = c(row_permu_TSP_Hamil, row_permu_TSP_Grad))

# Parallel-coordinate plot (template from the lecture); axes are listed
# explicitly so their order on the plot is fixed.
paral_dims <- list(
  list(label = "Food.Costs...", values = ~Food.Costs...),
  list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
  list(label = "Clothing.Index", values = ~Clothing.Index),
  list(label = "Hours.Worked", values = ~Hours.Worked),
  list(label = "Wage.Net", values = ~Wage.Net),
  list(label = "Vacation.Days", values = ~Vacation.Days),
  list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
  list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
  list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
  list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)

paral_plot <- plot_ly(data, type = "parcoords", dimensions = paral_dims)

paral_plot

# Same plot, with lines coloured by Food.Costs...
col_paral_plot <- plot_ly(
  data,
  type = "parcoords",
  dimensions = paral_dims,
  line = list(color = ~Food.Costs...)
)

col_paral_plot
# Juxtaposed radar charts straight from the template (base stars();
# hard to read, kept for comparison with the ggplot2 version below).
Euc_reordmatr_new <- as.data.frame(Euc_reordmatr)
stars(Euc_reordmatr, key.loc = c(17, -1), draw.segments = FALSE,
      col.stars = rep("Yellow", nrow(Euc_reordmatr)))

# ggplot2 version --------------------------------------------------------

# Rescale every variable to [0, 1]; mutate(across(...)) replaces the
# deprecated mutate_all(funs(...)) (funs() deprecated in dplyr 0.8.0).
Euc_reordmatr_new1 <- Euc_reordmatr_new %>%
  mutate(across(everything(), rescale))
Euc_reordmatr_new1$name <- rownames(Euc_reordmatr_new)

# Long format via pivot_longer() (gather() is superseded); the explicit
# factor keeps the variables in column order, as gather(factor_key = TRUE) did.
Euc_reordmatr_new2 <- Euc_reordmatr_new1 %>%
  tidyr::pivot_longer(-name, names_to = "variable", values_to = "value") %>%
  mutate(variable = factor(variable, levels = names(Euc_reordmatr_new))) %>%
  arrange(name)

# One radar (polar polygon) per city, faceted by city name.
p <- Euc_reordmatr_new2 %>%
  ggplot(aes(x = variable, y = value, group = name)) +
  geom_polygon(fill = "blue") +
  coord_polar() + theme_bw() + facet_wrap(~ name) +
  theme(axis.text.x = element_text(size = 5)) +
  theme(strip.text = element_text(size = 20))
p
#Q2setup
# Read the adult data set (no header row in the csv; header = FALSE
# spelled out rather than the reassignable F shorthand) and attach
# descriptive column names.
q2data <- read.table("adult.csv", sep = ",", header = FALSE)
colname <- c("age",
             "workclass",
             "fnlwgt",
             "education",
             "education-num",
             "marital-status",
             "occupation",
             "relationship",
             "race",
             "sex",
             "capital-gain",
             "capital-loss",
             "hours-per-week",
             "native-country",
             "Income level")
colnames(q2data) <- colname

#q21
# Scatter plot of weekly working hours against age, coloured by income level.
q21_scatter <- ggplot(q2data,
                      aes(x = age, y = `hours-per-week`,
                          color = `Income level`)) +
  geom_point() +
  labs(x = "Age", y = "Hours per week") +
  ggtitle("Hours per Week vs Age: Scatter Plot")
q21_scatter

# Same plot, faceted per income level (template from the course website).
q21_trellis <- ggplot(q2data,
                      aes(x = age, y = `hours-per-week`,
                          color = `Income level`)) +
  geom_point() +
  labs(x = "Age", y = "Hours per week") +
  ggtitle("Hours per Week vs Age: Trellis Plot") +
  facet_wrap(~`Income level`, labeller = "label_both")

q21_trellis
#q22
# Overlaid age densities per income level (alpha for transparency).
q22_density <- ggplot(q2data, aes(x = age, fill = `Income level`)) +
  geom_density(alpha = 0.5) +
  ggtitle("Density plot of Age grouped by the Income level") +
  labs(x = "Age", y = "Density")
q22_density

# Same densities, additionally conditioned on marital status.
q22_trellis <- ggplot(q2data, aes(x = age, fill = `Income level`)) +
  geom_density(alpha = 0.5) +
  ggtitle("Density plot of Age grouped by the Income level,  condition on Marital Status") +
  facet_wrap(~`marital-status`)
q22_trellis
#q23
# Keep observations with non-zero capital loss; default 3-D scatter plot.
nonzero_loss <- q2data$`capital-loss` != 0
filtered_data <- q2data[nonzero_loss, ]
q23_3d_scatter <- plot_ly(filtered_data,
                          x = ~`education-num`,
                          y = ~age,
                          z = ~`capital-loss`) %>%
  add_markers()
q23_3d_scatter
# Six equal-count age groups; 2-D density raster per group.
filtered_data$age_group <- cut_number(filtered_data$age, n = 6)
q23_trellis <- ggplot(filtered_data,
                      aes(x = `education-num`, y = `capital-loss`)) +
  geom_tile(aes(fill = after_stat(density)), stat = "density2d") +
  facet_wrap(~`age_group`, labeller = "label_both")
q23_trellis
#q24
# Compare non-overlapping intervals (cut_number) with overlapping shingles.

# --- cut_number: four equal-count, non-overlapping age groups ---
q24_data <- filtered_data
q24_data$age_group <- cut_number(q24_data$age, n = 4)

q24_cut <- ggplot(q24_data, aes(x = `education-num`, y = `capital-loss`)) +
  geom_point() +
  ggtitle("Trellis plot using cut_number() ") +
  facet_wrap(~`age_group`, labeller = "label_both")

# --- shingles: four groups with 10% overlap (template from course website) ---
Agerange <- lattice::equal.count(q24_data$age, number = 4, overlap = 0.1)
L <- matrix(unlist(levels(Agerange)), ncol = 2, byrow = TRUE)
L1 <- data.frame(Lower = L[, 1], Upper = L[, 2], Interval = factor(seq_len(nrow(L))))

# For each interval collect matching row indices and a class label;
# rows in the overlap deliberately appear in more than one group.
# (lapply + unlist instead of growing vectors with c() in a loop;
# paste0 instead of paste(..., sep = "").)
pieces <- lapply(seq_len(nrow(L1)), function(i) {
  lbl <- paste0("[", L1$Lower[i], ",", L1$Upper[i], "]")
  ind <- which(q24_data$age >= L1$Lower[i] & q24_data$age <= L1$Upper[i])
  list(index = ind, class = rep(lbl, length(ind)))
})
index <- unlist(lapply(pieces, `[[`, "index"))
Class <- unlist(lapply(pieces, `[[`, "class"))

q24_shingles_data <- q24_data[index, ]
q24_shingles_data$Class <- as.factor(Class)

q24_shingles <- ggplot(q24_shingles_data,
                       aes(x = `education-num`, y = `capital-loss`)) +
  geom_point() +
  ggtitle("Trellis plot using Shingles") +
  facet_wrap(~Class, labeller = "label_both")
q24_cut
q24_shingles